
# Loading Regex library
library(qdapRegex)
Attaching package: ‘qdapRegex’
The following object is masked from ‘package:dplyr’:
explain
The following object is masked from ‘package:ggplot2’:
%+%
# Extract tweet text from climate dataset
twt_txt <- climate_twts$text
head(twt_txt)
[1] "I don't think it's too much of an exaggeration to say everyone's fate on the planet probably hinges on the dems winning these 2 Georgia Senate seats the science is clear https://t.co/qmFkKqOwC2 https://t.co/M04ZS22LB0"
[2] "BOM and CSIRO State of the Climate 2020 shows Australia is experiencing climate change now\nhttps://t.co/HdxntH6ziX"
[3] "\"[this] “offensive inquisitiveness” whose goal of humiliating others [is] inherently objectionable... “gossip derived from malicious judgment of others” could only “cast a shadow of worthlessness” over humanity + climate of rage ... inimical to civil peace and social progress.\""
[4] "Trashing Labor on Climate policy,\nHas Michelle looked at the government??\n\nApparently it’s ok to question China on everything but it’s not ok to question thing’s attacks on US democracy.\n\nhttps://t.co/Djschiqc9g"
[5] "@BigDuke6__ @djmirk @CytometerMan @DPWIMM @realDonaldTrump @ChanelRion @OANN Funny, I thought that was climate change??"
[6] "@timinmitcham Expect thousands of climate change refugees to seek asylum!"
# Remove URLs from the tweet text
twt_txt_url <- rm_twitter_url(twt_txt)
# Replace special characters, punctuation, & numbers with spaces
twt_txt_chrs <- gsub("[^A-Za-z]"," " , twt_txt_url)
# Loading text mining library
library(tm)
Loading required package: NLP
Attaching package: ‘NLP’
The following object is masked from ‘package:ggplot2’:
annotate
# Convert the cleaned tweet text (twt_txt_chrs) to a text corpus
twt_corpus <- twt_txt_chrs %>%
VectorSource() %>%
Corpus()
# Convert the corpus to lowercase
twt_corpus_lwr <- tm_map(twt_corpus, tolower)
transformation drops documents
# Remove English stop words from the corpus using SMART dictionary and view the corpus
twt_corpus_stpwd <- tm_map(twt_corpus_lwr, removeWords, stopwords("smart"))
transformation drops documents
head(twt_corpus_stpwd$content)
[1] " don exaggeration fate planet hinges dems winning georgia senate seats science clear"
[2] "bom csiro state climate shows australia experiencing climate change "
[3] " offensive inquisitiveness goal humiliating inherently objectionable gossip derived malicious judgment cast shadow worthlessness humanity climate rage inimical civil peace social progress "
[4] "trashing labor climate policy michelle looked government apparently question china question thing attacks democracy "
[5] " bigduke djmirk cytometerman dpwimm realdonaldtrump chanelrion oann funny thought climate change "
[6] " timinmitcham expect thousands climate change refugees seek asylum "
# Remove additional spaces from the corpus
twt_corpus_spaces <- tm_map(twt_corpus_stpwd, stripWhitespace)
transformation drops documents
# Loading library for text analysis
library(qdap)
Loading required package: qdapDictionaries
Loading required package: qdapTools
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘qdapTools’
The following object is masked from ‘package:dplyr’:
id
Loading required package: RColorBrewer
Attaching package: ‘qdap’
The following objects are masked from ‘package:tm’:
as.DocumentTermMatrix, as.TermDocumentMatrix
The following object is masked from ‘package:NLP’:
ngrams
The following object is masked from ‘package:rtweet’:
%>%
The following object is masked from ‘package:forcats’:
%>%
The following object is masked from ‘package:stringr’:
%>%
The following object is masked from ‘package:dplyr’:
%>%
The following object is masked from ‘package:purrr’:
%>%
The following object is masked from ‘package:tidyr’:
%>%
The following object is masked from ‘package:tibble’:
%>%
The following objects are masked from ‘package:base’:
Filter, proportions
# Extract term frequencies for top 60 words and view output
termfreq <- freq_terms(twt_corpus_spaces, 60)
termfreq
# Create a vector of custom stop words
custom_stopwds <- c("amp", "ve", "don", "lo", "climate", "change")
# Remove custom stop words and create a refined corpus
corp_refined <- tm_map(twt_corpus_spaces, removeWords, custom_stopwds)
transformation drops documents
# Extract term frequencies for the top 25 words
termfreq_25w <- freq_terms(corp_refined, 25)
# Identify terms with more than 30 counts from the top 25 list
term30 <- subset(termfreq_25w, FREQ > 30)
# Barchart
term30 %>%
ggplot() +
aes(x = reorder(WORD, -FREQ), y = FREQ) +
geom_bar(stat = "identity", fill = "blue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Create word cloud with 10 colors and max 30 words
wordcloud(corp_refined, max.words = 30,
colors = brewer.pal(10, "Dark2"),
scale=c(4,1), random.order = FALSE)
n too large, allowed maximum for palette Dark2 is 8
Returning the palette you asked for with that many colors

# Load libraries
library(topicmodels)
# Create a document term matrix (DTM) for *climate*
dtm_climate <- DocumentTermMatrix(corp_refined)
# Find the sum of word counts in each document
rowTotals <- apply(dtm_climate, 1, sum)
# Select rows with a row total greater than zero
dtm_climate_new <- dtm_climate[rowTotals > 0, ]
# Create a topic model with 10 topics
topicmodl_10 <- LDA(dtm_climate_new, k = 10)
# Select and view the top 10 terms in the topic model
top_10terms <- terms(topicmodl_10, 10)
top_10terms
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7
[1,] "trump" "people" "action" "global" "time" "global" "work"
[2,] "crisis" "biden" "make" "biden" "covid" "biden" "people"
[3,] "biden" "action" "global" "action" "environment" "energy" "biden"
[4,] "snow" "science" "science" "oil" "green" "years" "energy"
[5,] "action" "energy" "biden" "world" "snow" "today" "policy"
[6,] "nov" "covid" "time" "australia" "world" "emissions" "world"
[7,] "oil" "health" "live" "crisis" "great" "people" "political"
[8,] "years" "make" "crisis" "big" "planet" "crisis" "crisis"
[9,] "government" "report" "years" "report" "global" "science" "good"
[10,] "green" "years" "good" "science" "pandemic" "trump" "future"
Topic 8 Topic 9 Topic 10
[1,] "biden" "people" "people"
[2,] "trump" "world" "world"
[3,] "action" "csiro" "biden"
[4,] "covid" "global" "justice"
[5,] "future" "control" "make"
[6,] "world" "time" "crisis"
[7,] "current" "great" "current"
[8,] "year" "lost" "future"
[9,] "government" "year" "emissions"
[10,] "carbon" "state" "health"
library(syuzhet)
Attaching package: ‘syuzhet’
The following object is masked from ‘package:rtweet’:
get_tokens
# Perform sentiment analysis for tweets on `ClimateCrisis`
sa.value <- get_nrc_sentiment(climate_twts$text)
`filter_()` is deprecated as of dplyr 0.7.0.
Please use `filter()` instead.
See vignette('programming') for more help
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
`group_by_()` is deprecated as of dplyr 0.7.0.
Please use `group_by()` instead.
See vignette('programming') for more help
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
`data_frame()` is deprecated as of tibble 1.1.0.
Please use `tibble()` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
# View the sentiment scores
head(sa.value, 10)
# Calculate sum of sentiment scores
score <- colSums(sa.value[,])
# Convert the sum of scores to a data frame
score_df <- data.frame(score)
# Convert row names into 'sentiment' column and combine with sentiment scores
score_df2 <- cbind(sentiment = row.names(score_df),
score_df, row.names = NULL)
print(score_df2)
# Plot the sentiment scores
ggplot(data = score_df2, aes(x = sentiment, y = score, fill = sentiment)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(igraph)
# Extract source vertex and target vertex from the tweet data frame
rply_df <- climate_twts[, c("screen_name" , "reply_to_screen_name" )]
# Remove rows with missing values
rply_df_new <- rply_df[complete.cases(rply_df), ]
# Create a matrix
rply_matrx <- as.matrix(rply_df_new)
# Convert the matrix to a reply network
nw_rply <- graph_from_edgelist(el = rply_matrx, directed = TRUE)
# Calculate out-degree scores from the reply network
out_degree <- degree(nw_rply, mode = c("out"))
# Sort the out-degree scores in decreasing order
out_degree_sort <- sort(out_degree, decreasing = TRUE)
# View users with the top 20 out-degree scores
out_degree_sort[1:20]
JimBuckley73 kgrandia erinbiba Climatehope2 CletusNwafor GuyRemorganised
44 37 16 14 14 13
NatBullard RebeccaElisabe3 NickThorsch KQ_VanCity bjames280961 eviyal
13 11 11 11 9 9
Richard09224278 _ppmv richardwakelin3 SurfTasmania verconnell BetelgeuseLxvi
8 8 8 8 8 8
climate_2017 beatulagan
7 6
# Compute the in-degree scores from the reply network
in_degree <- degree(nw_rply, mode = c("in"))
# Sort the in-degree scores in decreasing order
in_degree_sort <- sort(in_degree, decreasing = TRUE)
# View users with the top 10 in-degree scores
in_degree_sort[1:10]
BorisJohnson senatemajldr afneil KamalaHarris seanhannity mcuban
179 74 70 63 57 51
ozm realDonaldTrump JoeBiden zerohedge
43 41 30 24
# Calculate the betweenness scores from the reply network
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
betwn_nw <- betweenness(nw_rply, directed = TRUE)
# Sort betweenness scores in decreasing order and round the values
betwn_nw_sort <- betwn_nw %>%
sort(decreasing = TRUE) %>%
round()
# View users with the top 10 betweenness scores
betwn_nw_sort[1:10]
XRebellionUK bjames280961 RebeccaElisabe3 richardabetts Climatehope2 erinbiba
203 184 68 39 29 21
emilyhewertson hausfath T0myBarrient0s JesseLReynolds
20 19 18 14
nw_rply
IGRAPH 520fffe DN-- 9441 7121 --
+ attr: name (v/c), followers (v/c)
+ edges from 520fffe (vertex names):
[1] jamesmiller62 ->jamesmiller62 pjstack ->BigDuke6__
[3] 5b8c196ba3654e3->5b8c196ba3654e3 JimBuckley73 ->FinanceInCommon
[5] JimBuckley73 ->SenWarren JimBuckley73 ->9NewsSyd
[7] JimBuckley73 ->senatemajldr JimBuckley73 ->senatemajldr
[9] JimBuckley73 ->GOP JimBuckley73 ->SkyNewsAust
[11] JimBuckley73 ->StateDept JimBuckley73 ->TheDemocrats
[13] JimBuckley73 ->nyunggai JimBuckley73 ->StateDept
[15] JimBuckley73 ->SkyNewsAust JimBuckley73 ->ryanwmccormack
+ ... omitted several edges
library(maps)
# Extract geo-coordinates data to append as new columns
cc_coord <- lat_lng(climate_twts)
# Omit rows with missing geo-coordinates in the data frame
cc_geo <- na.omit(cc_coord[,c("lat", "lng")])
# Plot longitude and latitude values of tweets on UK
map(database = "world", region = "UK(?!r)", fill = TRUE, col = "light green")
with(cc_geo, points(lng, lat, pch = 20, cex = 1, col = 'blue'))

# Plot longitude and latitude values of tweets on the world map
map(database = "world", fill = TRUE, col = "light green")
with(cc_geo, points(lng, lat, pch = 20, cex = 1, col = 'blue'))

---
title: "R Notebook"
output: html_notebook
---

```{r message=FALSE}
# Load libraries
library(tidyverse)
library(httpuv)
library(rtweet)
library(readr)
library(here)
library(rjson)
```

```{r}

# Read the tweet dataset from the project's raw_data folder.
# The object must be named `climate_twts`: every later chunk refers to it
# by that name (the previous spelling `cimate_twts` left it undefined).
climate_twts <- read_twitter_csv(here("raw_data/climate_twts.csv"))

```

```{r}
# Plot tweet frequency over time, aggregated by hour, as a blue line
ts_plot(
  data = climate_twts,
  by = "hours",
  color = "blue"
)
```

```{r}
# Load the regex helper library
library(qdapRegex)

# Extract the tweet text from the climate dataset and preview it
twt_txt <- climate_twts$text
head(twt_txt)

# Remove URLs from the tweet text
twt_txt_url <- rm_twitter_url(twt_txt)

# Replace every non-letter character (special characters, punctuation,
# numbers) with a space
twt_txt_chrs <- gsub("[^A-Za-z]", " ", twt_txt_url)

# Load the text mining library
library(tm)

# Convert the cleaned tweet text (twt_txt_chrs) to a text corpus
twt_corpus <- twt_txt_chrs %>%
  VectorSource() %>%
  Corpus()

# Convert the corpus to lowercase.
# tolower() is a base function rather than a tm transformation, so it is
# wrapped in content_transformer() as tm_map() expects; this keeps the
# step valid regardless of which corpus class Corpus() returns.
twt_corpus_lwr <- tm_map(twt_corpus, content_transformer(tolower))

# Remove English stop words using the SMART dictionary and view the corpus
twt_corpus_stpwd <- tm_map(twt_corpus_lwr, removeWords, stopwords("smart"))
head(twt_corpus_stpwd$content)

# Collapse the runs of spaces left behind by the removals above
twt_corpus_spaces <- tm_map(twt_corpus_stpwd, stripWhitespace)

# Load the library for text analysis
library(qdap)

# Extract term frequencies for the top 60 words and view the output
termfreq <- freq_terms(twt_corpus_spaces, 60)
termfreq
```

```{r}
# Custom stop words to strip from the corpus
custom_stopwds <- c("amp", "ve", "don", "lo", "climate", "change")

# Refined corpus: the whitespace-normalised corpus minus the custom stop words
corp_refined <- tm_map(twt_corpus_spaces, removeWords, custom_stopwds)

# Term frequencies for the top 25 words of the refined corpus
termfreq_25w <- freq_terms(corp_refined, 25)

# Keep only the terms from that list that occur more than 30 times
term30 <- subset(termfreq_25w, FREQ > 30)

# Bar chart of the retained terms, ordered from most to least frequent
ggplot(data = term30, mapping = aes(x = reorder(WORD, -FREQ), y = FREQ)) +
  geom_bar(stat = "identity", fill = "blue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

```{r}
library(RColorBrewer)
library(wordcloud)

# Create a word cloud of at most 30 words from the refined corpus.
# The Dark2 palette defines at most 8 colours; requesting 10 only produced
# an "n too large" warning and still returned 8, so ask for 8 directly.
wordcloud(corp_refined,
  max.words = 30,
  colors = brewer.pal(8, "Dark2"),
  scale = c(4, 1), random.order = FALSE
)
```

```{r}

# Load libraries
library(topicmodels)

# Create a document term matrix (DTM) from the refined corpus
dtm_climate <- DocumentTermMatrix(corp_refined)

# Find the sum of word counts in each document.
# slam::row_sums() works on the sparse DTM directly; apply() would first
# coerce the whole matrix to a dense one. slam is a dependency of tm, so
# it is addressed by namespace rather than attached.
rowTotals <- slam::row_sums(dtm_climate)

# Keep only documents that still contain at least one term
# (LDA() cannot handle all-zero rows)
dtm_climate_new <- dtm_climate[rowTotals > 0, ]

# Create a topic model with 10 topics; the fixed seed makes the fitted
# topics reproducible between runs
topicmodl_10 <- LDA(dtm_climate_new, k = 10, control = list(seed = 1234))

# Select and view the top 10 terms in each topic
top_10terms <- terms(topicmodl_10, 10)
top_10terms


```

```{r}
library(syuzhet)

# Score the text of every climate tweet with get_nrc_sentiment()
sa.value <- get_nrc_sentiment(char_v = climate_twts[["text"]])

# Preview the first ten rows of sentiment scores
head(sa.value, n = 10)
```

```{r}
# Total score per sentiment column across all tweets
score <- colSums(sa.value)

# Hold the totals in a one-column data frame (sentiment names as row names)
score_df <- data.frame(score)

# Move the row names into an explicit 'sentiment' column next to the totals
score_df2 <- data.frame(
  sentiment = row.names(score_df),
  score_df,
  row.names = NULL,
  check.names = FALSE
)
print(score_df2)

# Bar chart of the total score for each sentiment
ggplot(score_df2, aes(x = sentiment, y = score, fill = sentiment)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

```{r message=FALSE}
library(igraph)

# Source vertex (screen_name) and target vertex (reply_to_screen_name)
# for every tweet in the data frame
rply_df <- climate_twts[, c("screen_name", "reply_to_screen_name")]

# Keep only rows where both ends of the edge are present
rply_df_new <- rply_df[complete.cases(rply_df), ]

# Two-column edge list in matrix form
rply_matrx <- as.matrix(rply_df_new)

# Directed reply network built from the edge list
nw_rply <- graph_from_edgelist(el = rply_matrx, directed = TRUE)

# Out-degree score of each vertex in the reply network
out_degree <- degree(nw_rply, mode = "out")

# Out-degree scores, highest first
out_degree_sort <- sort(out_degree, decreasing = TRUE)

# Users with the top 20 out-degree scores
out_degree_sort[seq_len(20)]
```

```{r}
# In-degree score of each vertex in the reply network
in_degree <- degree(nw_rply, mode = "in")

# In-degree scores, highest first
in_degree_sort <- sort(in_degree, decreasing = TRUE)

# Users with the top 10 in-degree scores
in_degree_sort[seq_len(10)]
```

```{r}
# Betweenness score of each vertex in the directed reply network
betwn_nw <- betweenness(nw_rply, directed = TRUE)

# Betweenness scores sorted highest first, then rounded to whole numbers
betwn_nw_sort <- round(sort(betwn_nw, decreasing = TRUE))

# Users with the top 10 betweenness scores
betwn_nw_sort[seq_len(10)]
```

```{r}
# Out-degree of every vertex in the reply network
deg_out <- degree(nw_rply, mode = "out")
deg_out

# Amplified out-degree values.
# NOTE(review): the plot below sizes vertices by the raw `deg_out`, not by
# `vert_size`; kept as-is so the rendered plot does not change.
vert_size <- (deg_out * 3)
# + 5

# User-level data for the tweets
user_cos <- users_data(climate_twts)

# Categorise follower counts: "1" above 500, "0" otherwise
user_cos$follow <- ifelse(user_cos$followers_count > 500, "1", "0")

# Attach the category to the reply network as a vertex attribute.
# user_cos has a different length and order than the vertex set, so the
# values are looked up by screen name instead of being assigned
# positionally. Vertices with no matching user row get NA.
# Assumes users_data() output carries a `screen_name` column - TODO confirm.
vert_idx <- match(V(nw_rply)$name, user_cos$screen_name)
V(nw_rply)$followers <- user_cos$follow[vert_idx]
vertex_attr(nw_rply)

# Vertex colours by follower category: "0" -> lightgreen, "1" -> tomato.
# Explicit factor levels keep this mapping stable even when only one
# category is present; vertices without follower data are drawn in grey.
sub_color <- c("lightgreen", "tomato")
follow_cat <- factor(vertex_attr(nw_rply, "followers"), levels = c("0", "1"))
vert_color <- sub_color[follow_cat]
vert_color[is.na(vert_color)] <- "grey80"

plot(nw_rply,
  asp = 9 / 12,
  vertex.size = deg_out, edge.arrow.size = 0.5,
  vertex.label.cex = 0.8,
  vertex.color = vert_color,
  vertex.label.color = "black", vertex.frame.color = "grey"
)
```

```{r}

library(maps)

# Append geo-coordinate columns (lat, lng) to the tweet data
cc_coord <- lat_lng(climate_twts)

# Keep just the coordinates, dropping rows where either one is missing
cc_geo <- na.omit(cc_coord[, c("lat", "lng")])

# UK map with one blue point per geo-located tweet
map(database = "world", region = "UK(?!r)", fill = TRUE, col = "light green")
points(x = cc_geo$lng, y = cc_geo$lat, pch = 20, cex = 1, col = "blue")

# World map with the same tweet locations
map(database = "world", fill = TRUE, col = "light green")
points(x = cc_geo$lng, y = cc_geo$lat, pch = 20, cex = 1, col = "blue")
```

